# List of required packages
required_packages <- c(
"dplyr",
"ggplot2",
"broom.mixed",
"lme4",
"readr",
"emmeans"
)
# Install missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load all packages
library(dplyr)
library(ggplot2)
library(broom.mixed)
library(lme4)
library(readr)
library(emmeans)
source(here::here("examples", "colors.R"))
set.seed(123)Application 6: Visualizing Correlations and Models
Looking At Data: Correlations
Correlations are indisposable for understanding the relationship between two variables, but they can be misleading as we show below.
This is adapted directly from Jan Vanhove.
plot_r <- function(df, showSmoother = TRUE, smoother = "lm") {
p <- ggplot(df, aes(x = x, y = y)) +
geom_point(alpha = 0.7)
if(showSmoother) {
p <- p +
geom_smooth(
formula = y ~ x,
method = smoother,
color = colors$green$`500`,
fill = colors$slate$`300`,
alpha = 0.3,
)
}
p <- p +
facet_wrap(~title, scales = "free", ncol = 2) +
theme_minimal(base_size = 12) +
theme(
strip.text = element_text(face = "bold", size = 14),
axis.title = element_blank(),
plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 10),
panel.spacing = unit(2, "lines")
)
p
}
corr_r <- function(r = 0.6, n = 50) {
compute.y <- function(x, y, r) {
theta <- acos(r)
X <- cbind(x, y)
Xctr <- scale(X, center = TRUE, scale = FALSE) # Centered variables (mean 0)
Id <- diag(n) # Identity matrix
Q <- qr.Q(qr(Xctr[, 1, drop = FALSE])) # QR decomposition
P <- tcrossprod(Q) # Projection onto space defined by x1
x2o <- (Id - P) %*% Xctr[, 2] # x2ctr made orthogonal to x1ctr
Xc2 <- cbind(Xctr[, 1], x2o)
Y <- Xc2 %*% diag(1 / sqrt(colSums(Xc2 ^ 2)))
y <- Y[, 2] + (1 / tan(theta)) * Y[, 1]
return(y)
}
cases <- list(
list(id = 1, title = "(1) Normal x, normal residuals", x = rnorm(n), y = rnorm(n)),
list(id = 2, title = "(2) Uniform x, normal residuals", x = runif(n, 0, 1), y = rnorm(n)),
list(id = 3, title = "(3) +-skewed x, normal residuals", x = rlnorm(n, 5), y = rnorm(n)),
list(id = 4, title = "(4) --skewed x, normal residuals", x = rlnorm(n, 5) * -1 + 5000, y = rnorm(n)),
list(id = 5, title = "(5) Normal x, +-skewed residuals", x = rnorm(n), y = rlnorm(n, 5)),
list(id = 6, title = "(6) Normal x, --skewed residuals", x = rnorm(n), y = -rlnorm(n, 5)),
list(id = 7, title = "(7) Increasing spread",
x = sort(rnorm(n)) + abs(min(rnorm(n))),
y = rnorm(n, 0, sqrt(abs(10 * sort(rnorm(n)))))),
list(id = 8, title = "(8) Decreasing spread",
x = sort(rnorm(n)) + abs(min(rnorm(n))),
y = rnorm(n, 0, sqrt(pmax(0.1, abs(10 * max(sort(rnorm(n))) - 10 * sort(rnorm(n))))))),
list(id = 9, title = "(9) Quadratic trend", x = rnorm(n), y = rnorm(n) ^ 2),
list(id = 10, title = "(10) Sinusoid relationship", x = runif(n, -2 * pi, 2 * pi), y = sin(runif(n, -2 * pi, 2 * pi))),
list(id = 11, title = "(11) A single positive outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), 15)),
list(id = 12, title = "(12) A single negative outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), -15)),
list(id = 13, title = "(13) Bimodal residuals", x = rnorm(n), y = c(rnorm(floor(n / 2), mean = -3), rnorm(ceiling(n / 2), 3))),
list(id = 14, title = "(14) Two groups",
x = c(rnorm(floor(n / 2), -3), rnorm(ceiling(n / 2), 3)),
y = c(rnorm(floor(n / 2), mean = 3), rnorm(ceiling(n / 2), mean = -3))),
list(id = 15, title = "(15) Sampling at the extremes",
x = c(rnorm(floor(n / 2)), rnorm(ceiling(n / 2), mean = 10)),
y = rnorm(n)),
list(id = 16, title = "(16) Categorical data",
x = sample(1:5, n, replace = TRUE),
y = sample(1:7, n, replace = TRUE))
)
df <- bind_rows(lapply(cases, function(case) {
id = case$id
x <- case$x
y <- compute.y(x, case$y, r)
data.frame(id = id, x = x, y = y, title = case$title)
}))
df$title <- factor(df$title, levels = paste0("(", 1:16, ") ", c(
"Normal x, normal residuals",
"Uniform x, normal residuals",
"+-skewed x, normal residuals",
"--skewed x, normal residuals",
"Normal x, +-skewed residuals",
"Normal x, --skewed residuals",
"Increasing spread",
"Decreasing spread",
"Quadratic trend",
"Sinusoid relationship",
"A single positive outlier",
"A single negative outlier",
"Bimodal residuals",
"Two groups",
"Sampling at the extremes",
"Categorical data"
)))
return(df)
}
data <- corr_r(r=0.3, n=100)
analysis_data <- data |> filter(id == 1)
model <- lm(y ~ x, data = analysis_data)
summary(model)
Call:
lm(formula = y ~ x, data = analysis_data)
Residuals:
Min 1Q Median 3Q Max
-0.19848 -0.07113 -0.00910 0.06042 0.34241
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.00313 0.01015 -0.308 0.75846
x 0.03463 0.01112 3.113 0.00243 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.101 on 98 degrees of freedom
Multiple R-squared: 0.09, Adjusted R-squared: 0.08071
F-statistic: 9.692 on 1 and 98 DF, p-value: 0.002426
cor(analysis_data$x, analysis_data$y)[1] 0.3
plot_r(data, showSmoother=FALSE)plot_r(data, showSmoother=TRUE)Visualizing Model Outputs
Visualization of model outputs is often neglected, but it can be a powerful way both to undrestand and to communicate the results of a model. A number of packages exist to help with this, including broom.mixed, emmeans, and ggeffects.
Here, we fit a logistic mixed-effects model using the glmer() function to estimate the odds of receiving comprehensive postnatal care. This model accounts for both fixed effects (individual-level predictors like race or insurance status) and a random intercept for provider, which captures unobserved heterogeneity across providers.
# Load data
data <- read_csv(here::here("data", "processed", "simulated_data.csv"))Rows: 50000 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): state, self_report_income, edu, race_ethnicity, insurance, job_type
dbl (14): id, provider_id, received_comprehensive_postnatal_care, age, depen...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sample 1/4 of the sites
set.seed(123)
unique_sites <- unique(data$provider_id)
reduced_sites <- sample(unique_sites, length(unique_sites) * 0.10)
data <- data[data$provider_id %in% reduced_sites, ]
# Fit the model
model <- glmer(
received_comprehensive_postnatal_care ~
race_ethnicity +
log(distance_to_provider) +
insurance +
multiple_gestation +
placenta_previa +
gest_hypertension +
preeclampsia +
(1 | provider_id),
data = data,
family = binomial(link = "logit"),
control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e5))
)
# Create variable labels for better visualization
variable_labels <- c(
"race_ethnicityaian" = "AIAN",
"race_ethnicityasian" = "Asian",
"race_ethnicityblack" = "Black",
"race_ethnicityhispanic" = "Hispanic",
"race_ethnicitynhpi" = "NHPI",
"race_ethnicityother" = "Other",
"log(distance_to_provider)" = "Log(Distance to Provider)",
"insuranceprivate" = "Insurance: Private",
"insurancestate_provided" = "Insurance: State-Provided",
"multiple_gestation" = "Multiple Gestation",
"placenta_previa" = "Placenta Previa",
"gest_hypertension" = "Gestational Hypertension",
"preeclampsia" = "Preeclampsia",
"(Intercept)" = "Intercept"
)Below, we extract the fixed-effect estimates using broom.mixed::tidy(), including 95% confidence intervals, and plot them as a coefficient plot (a forest plot for regression coefficients).
Each point shows the estimated log-odds of receiving comprehensive care associated with a given covariate, holding other variables constant. The dashed vertical line at 0 represents no effect.
# Get fixed effects with confidence intervals
fixed_effects <- tidy(model, conf.int = TRUE) %>%
filter(effect == "fixed") %>%
# Remove any NA values
filter(!is.na(estimate)) %>%
# Create a more readable term name
mutate(term = case_when(
term == "(Intercept)" ~ "Intercept",
term == "age" ~ "Age",
term == "sexMale" ~ "Male Sex",
term == "raceBlack" ~ "Black Race",
term == "raceHispanic" ~ "Hispanic Race",
term == "raceOther" ~ "Other Race",
term == "insurancePrivate" ~ "Private Insurance",
term == "insuranceMedicaid" ~ "Medicaid",
term == "insuranceOther" ~ "Other Insurance",
term == "comorbidity_score" ~ "Comorbidity Score",
term == "emergency_admissionTRUE" ~ "Emergency Admission",
term == "weekend_admissionTRUE" ~ "Weekend Admission",
term == "night_admissionTRUE" ~ "Night Admission",
TRUE ~ term
))
# Create the forest plot
ggplot(fixed_effects, aes(x = estimate, y = reorder(term, estimate))) +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
height = 0.2, color = colors$slate$`300`) +
geom_point(size = 3, color = colors$blue$`500`) +
geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
labs(
title = "Fixed Effects from GLMM",
subtitle = "Each point represents the estimated effect on log-odds of receiving comprehensive care",
x = "Effect Size (95% CI)",
y = "Predictor"
) +
theme_minimal() +
theme(
panel.grid.major.y = element_blank(),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.text.y = element_text(size = 10)
)Predicted Probabilities by Covariate Groups
Here, we use the emmeans package to estimate marginal predicted probabilities for selected covariates. These represent the expected probability of receiving comprehensive care for each level of a covariate, averaging over the distribution of other covariates in the model. This approach helps isolate the relationship between each covariate and the outcome while controlling for confounding. We visualize these estimates and their 95% confidence intervals, grouped by domain (e.g., race/ethnicity, insurance, medical conditions).
# Calculate predicted probabilities for each covariate group
library(emmeans)
# Race/Ethnicity
race_probs <- emmeans(model, ~ race_ethnicity, type = "response")
race_probs_df <- as.data.frame(race_probs) %>%
mutate(
group = "Race/Ethnicity",
category = case_when(
race_ethnicity == "aian" ~ "AIAN",
race_ethnicity == "asian" ~ "Asian",
race_ethnicity == "black" ~ "Black",
race_ethnicity == "hispanic" ~ "Hispanic",
race_ethnicity == "nhpi" ~ "NHPI",
race_ethnicity == "other" ~ "Other",
race_ethnicity == "white" ~ "White"
)
)
# Insurance
insurance_probs <- emmeans(model, ~ insurance, type = "response")
insurance_probs_df <- as.data.frame(insurance_probs) %>%
mutate(
group = "Insurance",
category = case_when(
insurance == "medicaid" ~ "Medicaid",
insurance == "private" ~ "Private",
insurance == "state_provided" ~ "State-Provided",
insurance == "uninsured" ~ "Uninsured",
TRUE ~ "Other"
)
)
# Medical Conditions
conditions_probs <- emmeans(model, ~ multiple_gestation + placenta_previa + gest_hypertension + preeclampsia, type = "response")
conditions_probs_df <- as.data.frame(conditions_probs) %>%
mutate(
group = "Medical Conditions",
category = case_when(
multiple_gestation == 1 ~ "Multiple Gestation",
placenta_previa == 1 ~ "Placenta Previa",
gest_hypertension == 1 ~ "Gestational Hypertension",
preeclampsia == 1 ~ "Preeclampsia",
TRUE ~ "No Medical Conditions"
)
) %>%
# Remove duplicates where multiple conditions are true
distinct(category, .keep_all = TRUE)
# Combine all predictions
all_probs <- bind_rows(
race_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL),
insurance_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL),
conditions_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL)
)
# Create faceted plot
ggplot(all_probs, aes(x = prob, y = reorder(category, prob))) +
geom_errorbarh(aes(xmin = asymp.LCL, xmax = asymp.UCL),
height = 0.2, color = colors$slate$`300`) +
geom_point(size = 3, color = colors$blue$`500`) +
facet_wrap(~ group, scales = "free_y", ncol = 1) +
labs(
title = "Predicted Probability of Receiving Comprehensive Care",
subtitle = "By Covariate Groups",
x = "Predicted Probability (95% CI)",
y = "Category"
) +
scale_x_continuous(labels = scales::percent) +
theme_minimal() +
theme(
panel.grid.major.y = element_blank(),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.text.y = element_text(size = 10),
strip.text = element_text(size = 12, face = "bold")
)Provider-Level Random Effects
We visualize the provider-level random intercepts, which represent the estimated deviation of each provider from the overall mean (intercept) after accounting for the fixed effects in the model. This helps reveal between-provider variability and can identify providers whose outcomes are systematically higher or lower than expected. The dashed vertical line at zero indicates the overall average; providers to the right have higher-than-average effects, and those to the left are lower.
# Get random effects
random_effects <- ranef(model, condVar = TRUE)
random_effects_data <- as.data.frame(random_effects)
# Create the forest plot of random effects
ggplot(random_effects_data, aes(x = condval, y = reorder(grp, condval))) +
geom_errorbarh(aes(xmin = condval - 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,]),
xmax = condval + 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,])),
height = 0.2, color = colors$slate$`300`) +
geom_point(size = 3, color = colors$blue$`500`) +
geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
labs(
title = "Random Effects by Provider",
subtitle = "Each point represents the provider-specific deviation from the overall intercept",
x = "Provider-Specific Effect (95% CI)",
y = "Provider ID"
) +
theme_minimal() +
theme(
panel.grid.major.y = element_blank(),
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 12),
axis.text.y = element_text(size = 10)
)# Print model summary
summary(model)Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: binomial ( logit )
Formula:
received_comprehensive_postnatal_care ~ race_ethnicity + log(distance_to_provider) +
insurance + multiple_gestation + placenta_previa + gest_hypertension +
preeclampsia + (1 | provider_id)
Data: data
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))
AIC BIC logLik -2*log(L) df.resid
6183.2 6280.7 -3076.6 6153.2 4913
Scaled residuals:
Min 1Q Median 3Q Max
-1.1303 -0.7272 -0.6137 1.2288 2.3987
Random effects:
Groups Name Variance Std.Dev.
provider_id (Intercept) 0.1589 0.3987
Number of obs: 4928, groups: provider_id, 50
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.905542 0.391365 -2.314 0.0207 *
race_ethnicityasian 0.248827 0.402795 0.618 0.5367
race_ethnicityblack -0.059232 0.395410 -0.150 0.8809
race_ethnicityhispanic 0.055657 0.388852 0.143 0.8862
race_ethnicitynhpi 0.004071 0.700685 0.006 0.9954
race_ethnicityother 0.279441 0.544121 0.514 0.6076
race_ethnicitywhite 0.196744 0.384417 0.512 0.6088
log(distance_to_provider) -0.023705 0.023838 -0.994 0.3200
insuranceprivate 0.139653 0.074399 1.877 0.0605 .
insurancestate_provided 0.045824 0.082829 0.553 0.5801
multiple_gestation 0.175905 0.178027 0.988 0.3231
placenta_previa 0.309438 0.315725 0.980 0.3270
gest_hypertension 0.035563 0.137519 0.259 0.7959
preeclampsia 0.126002 0.224375 0.562 0.5744
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation matrix not shown by default, as p = 14 > 12.
Use print(x, correlation=TRUE) or
vcov(x) if you need it